The Encoder (PART 1)¶
The encoder section of the transformer was coded in the Lecture 4 example. Headings have been added for clarity.
1.1 Setup Dependencies¶
pip install js2py
Collecting js2py
Downloading Js2Py-0.74-py3-none-any.whl (1.0 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.0/1.0 MB 17.6 MB/s eta 0:00:00a 0:00:01
Collecting tzlocal>=1.2 (from js2py)
Obtaining dependency information for tzlocal>=1.2 from https://files.pythonhosted.org/packages/97/3f/c4c51c55ff8487f2e6d0e618dba917e3c3ee2caae6cf0fbb59c9b1876f2e/tzlocal-5.2-py3-none-any.whl.metadata
Downloading tzlocal-5.2-py3-none-any.whl.metadata (7.8 kB)
Requirement already satisfied: six>=1.10 in /opt/conda/lib/python3.10/site-packages (from js2py) (1.16.0)
Collecting pyjsparser>=2.5.1 (from js2py)
Downloading pyjsparser-2.7.1.tar.gz (24 kB)
Preparing metadata (setup.py) ... done
Downloading tzlocal-5.2-py3-none-any.whl (17 kB)
Building wheels for collected packages: pyjsparser
Building wheel for pyjsparser (setup.py) ... done
Created wheel for pyjsparser: filename=pyjsparser-2.7.1-py3-none-any.whl size=25984 sha256=2503c8b6f164133b9c5d95301e30306d249bfbc918e3142c287d5dbbfb1ad18e
Stored in directory: /root/.cache/pip/wheels/5e/81/26/5956478df303e2bf5a85a5df595bb307bd25948a4bab69f7c7
Successfully built pyjsparser
Installing collected packages: pyjsparser, tzlocal, js2py
Successfully installed js2py-0.74 pyjsparser-2.7.1 tzlocal-5.2
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
Note: you may need to restart the kernel to use updated packages.
import js2py
%%javascript
require.config({
paths: {
d3: '//cdnjs.cloudflare.com/ajax/libs/d3/3.4.8/d3.min',
jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
}
});
1.2 Tokenization¶
from transformers import AutoTokenizer
from bertviz.transformers_neuron_view import BertModel
from bertviz.neuron_view import show
model_ckpt = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = BertModel.from_pretrained(model_ckpt)
text = "time flies like an arrow"
show(model, "bert", tokenizer, text, display_mode="light", layer=0, head=8)
Downloading (…)okenizer_config.json: 0%| | 0.00/28.0 [00:00<?, ?B/s]
Downloading (…)lve/main/config.json: 0%| | 0.00/570 [00:00<?, ?B/s]
Downloading (…)solve/main/vocab.txt: 0%| | 0.00/232k [00:00<?, ?B/s]
Downloading (…)/main/tokenizer.json: 0%| | 0.00/466k [00:00<?, ?B/s]
100% 433/433 [00:00<00:00, 1499697.47B/s] 100% 440473133/440473133 [00:09<00:00, 45591989.25B/s]
from transformers import AutoTokenizer
model_ckpt = "bert-base-uncased"
text = "time flies like an arrow"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
inputs = tokenizer(text, return_tensors="pt", add_special_tokens=False)
inputs.input_ids
tensor([[ 2051, 10029, 2066, 2019, 8612]])
from torch import nn
from transformers import AutoConfig
config = AutoConfig.from_pretrained(model_ckpt)
token_emb = nn.Embedding(config.vocab_size, config.hidden_size)
token_emb
Embedding(30522, 768)
inputs_embeds = token_emb(inputs.input_ids)
inputs_embeds.size()
torch.Size([1, 5, 768])
Self-Attention - Encoder (Step by Step)¶
import torch
from math import sqrt
query = key = value = inputs_embeds
dim_k = key.size(-1)
scores = torch.bmm(query, key.transpose(1,2)) / sqrt(dim_k)
scores.size()
torch.Size([1, 5, 5])
import torch.nn.functional as F
weights = F.softmax(scores, dim=-1)
weights.sum(dim=-1)
tensor([[1., 1., 1., 1., 1.]], grad_fn=<SumBackward1>)
attn_outputs = torch.bmm(weights, value)
attn_outputs.shape
torch.Size([1, 5, 768])
1.3 Scaled Dot Product¶
Scaled dot-product attention — a single function that combines all of the previous steps.
def scaled_dot_product_attention(query, key, value, mask=None):
    """Compute scaled dot-product attention.

    Args:
        query, key, value: tensors of shape (batch, seq_len, dim).
        mask: optional (batch, seq_len, seq_len) tensor; positions where
            mask == 0 are blocked from attention. Defaults to None so all
            existing three-argument callers keep working; this matches the
            masked redefinition used later in the decoder section.

    Returns:
        Tensor of shape (batch, seq_len, dim): the attention-weighted values.
    """
    dim_k = query.size(-1)
    # Scale by sqrt(d_k) to keep the softmax in a well-conditioned range.
    scores = torch.bmm(query, key.transpose(1, 2)) / sqrt(dim_k)
    if mask is not None:
        # Blocked positions get -inf so softmax assigns them zero weight.
        scores = scores.masked_fill(mask == 0, float("-inf"))
    weights = F.softmax(scores, dim=-1)
    return torch.bmm(weights, value)
1.4 Multi-headed attention¶
Attention Head
class AttentionHead(nn.Module):
    """A single attention head.

    Projects the input into lower-dimensional query/key/value subspaces
    and applies scaled dot-product attention over them.
    """

    def __init__(self, embed_dim, head_dim):
        super().__init__()
        # Independent linear projections for query, key and value.
        self.q = nn.Linear(embed_dim, head_dim)
        self.k = nn.Linear(embed_dim, head_dim)
        self.v = nn.Linear(embed_dim, head_dim)

    def forward(self, hidden_state):
        # All three projections come from the same hidden state (self-attention).
        q_proj = self.q(hidden_state)
        k_proj = self.k(hidden_state)
        v_proj = self.v(hidden_state)
        return scaled_dot_product_attention(q_proj, k_proj, v_proj)
Multi-Head Attention
class MultiHeadAttention(nn.Module):
    """Runs several AttentionHeads in parallel and mixes their outputs.

    Each head attends in a subspace of size hidden_size // num_heads; the
    concatenated head outputs are recombined by a final linear layer.
    """

    def __init__(self, config):
        super().__init__()
        embed_dim = config.hidden_size
        num_heads = config.num_attention_heads
        head_dim = embed_dim // num_heads
        self.heads = nn.ModuleList(
            [AttentionHead(embed_dim, head_dim) for _ in range(num_heads)]
        )
        self.output_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, hidden_state):
        # Run every head on the same input, then concatenate along the
        # feature axis and project back to embed_dim.
        per_head = [head(hidden_state) for head in self.heads]
        concatenated = torch.cat(per_head, dim=-1)
        return self.output_linear(concatenated)
multihead_attn = MultiHeadAttention(config)
attn_output = multihead_attn(inputs_embeds)
attn_output.size()
torch.Size([1, 5, 768])
from bertviz import head_view
from transformers import AutoModel
model = AutoModel.from_pretrained(model_ckpt, output_attentions=True)
sentence_a = "time flies like an arrow"
sentence_b = "fruit flies like a banana"
viz_inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt')
attention = model(**viz_inputs).attentions
sentence_b_start = (viz_inputs.token_type_ids == 0).sum(dim=1)
tokens = tokenizer.convert_ids_to_tokens(viz_inputs.input_ids[0])
head_view(attention, tokens, sentence_b_start, heads=[8])
Downloading model.safetensors: 0%| | 0.00/440M [00:00<?, ?B/s]
1.5 The Feed-Forward Layer¶
class FeedForward(nn.Module):
def __init__(self, config):
super().__init__()
self.linear_1 = nn.Linear(config.hidden_size, config.intermediate_size)
self.linear_2 = nn.Linear(config.intermediate_size, config.hidden_size)
self.gelu = nn.GELU()
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, x):
x = self.linear_1(x)
x = self.gelu(x)
x = self.linear_2(x)
x = self.dropout(x)
return x
feed_forward = FeedForward(config)
ff_outputs = feed_forward(attn_outputs)
ff_outputs.size()
torch.Size([1, 5, 768])
1.6 Encoder Layer¶
class TransformerEncoderLayer(nn.Module):
    """Pre-layer-norm transformer encoder block.

    Self-attention followed by a position-wise feed-forward network, each
    sub-block normalized first and wrapped in a residual (skip) connection.
    """

    def __init__(self, config):
        super().__init__()
        self.layer_norm_1 = nn.LayerNorm(config.hidden_size)
        self.layer_norm_2 = nn.LayerNorm(config.hidden_size)
        self.attention = MultiHeadAttention(config)
        self.feed_forward = FeedForward(config)

    def forward(self, x):
        # Self-attention sub-block: normalize, attend, add back (pre-LN).
        x = x + self.attention(self.layer_norm_1(x))
        # Feed-forward sub-block: same normalize-then-residual pattern.
        x = x + self.feed_forward(self.layer_norm_2(x))
        return x
encoder_layer = TransformerEncoderLayer(config)
inputs_embeds.shape, encoder_layer(inputs_embeds).size()
(torch.Size([1, 5, 768]), torch.Size([1, 5, 768]))
1.7 Positional Embeddings¶
class Embeddings(nn.Module):
    """Token + learned positional embeddings with layer norm and dropout.

    Maps input token ids of shape (batch, seq_len) to embeddings of shape
    (batch, seq_len, hidden_size).
    """

    def __init__(self, config):
        super().__init__()
        self.token_embeddings = nn.Embedding(config.vocab_size,
                                             config.hidden_size)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings,
                                                config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-12)
        # Fix: use the configured dropout probability. The original
        # nn.Dropout() silently defaulted to p=0.5, inconsistent with
        # FeedForward which reads config.hidden_dropout_prob.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids):
        # Create position IDs for the input sequence, on the same device as
        # input_ids (the original always allocated them on the CPU, which
        # breaks when the model runs on GPU).
        seq_length = input_ids.size(1)
        position_ids = torch.arange(
            seq_length, dtype=torch.long, device=input_ids.device
        ).unsqueeze(0)
        # Look up token and position embeddings, then combine them.
        token_embeddings = self.token_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        embeddings = token_embeddings + position_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
embedding_layer = Embeddings(config)
embedding_layer(inputs.input_ids).size()
torch.Size([1, 5, 768])
1.8 FULL Encoder¶
class TransformerEncoder(nn.Module):
    """Full encoder: embedding layer followed by a stack of encoder layers."""

    def __init__(self, config):
        super().__init__()
        self.embeddings = Embeddings(config)
        self.layers = nn.ModuleList(
            [TransformerEncoderLayer(config)
             for _ in range(config.num_hidden_layers)]
        )

    def forward(self, x):
        # Embed the token ids, then pass the result through each encoder
        # layer in order.
        hidden = self.embeddings(x)
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden)
        return hidden
encoder = TransformerEncoder(config)
encoder(inputs.input_ids).size()
torch.Size([1, 5, 768])
The Decoder (PART 2)¶
Decoder Format:
- Embeddings
- Decoder stack**
- Classification head
** for the decoder STACK, we need:
- 1 self attention (using masked dot product)
- 1 cross-attention layer (it needs to accept the key and value matrices from the encoder as input).
- feed forward network
2.1 Self Attention (w. Mask)¶
seq_len = inputs.input_ids.size(-1)
mask = torch.tril(torch.ones(seq_len, seq_len)).unsqueeze(0)
mask[0]
tensor([[1., 0., 0., 0., 0.],
[1., 1., 0., 0., 0.],
[1., 1., 1., 0., 0.],
[1., 1., 1., 1., 0.],
[1., 1., 1., 1., 1.]])
scores.masked_fill(mask == 0, -float("inf"))
tensor([[[26.6093, -inf, -inf, -inf, -inf],
[ 0.4334, 26.9444, -inf, -inf, -inf],
[-0.8475, -0.5703, 25.1118, -inf, -inf],
[ 1.4294, 0.0882, 1.0268, 27.6777, -inf],
[-0.6510, -0.5275, -1.4019, -0.2303, 26.0880]]],
grad_fn=<MaskedFillBackward0>)
def scaled_dot_product_attention(query, key, value, mask=None):
    """Scaled dot-product attention with an optional mask.

    Positions where mask == 0 receive -inf scores and therefore zero
    attention weight after the softmax (used for causal masking in the
    decoder).
    """
    scale = sqrt(query.size(-1))
    scores = query.bmm(key.transpose(1, 2)) / scale
    if mask is not None:
        # Block forbidden positions before the softmax.
        scores = scores.masked_fill(mask == 0, float("-inf"))
    attn_weights = F.softmax(scores, dim=-1)
    return torch.bmm(attn_weights, value)
2.2 Cross-Attention (Encoder-Decoder Attention Layer)¶
This is brand new. We use existing LayerNorm and MultiHeadAttention in the attributes section. But, in the forward function, we specify that the query comes from the decoder, and that the key/value comes from the encoder output.
class CrossAttentionLayer(nn.Module):
    """Encoder-decoder (cross) attention layer.

    Queries are projected from the (layer-normalized) decoder hidden states;
    keys and values are projected from the encoder output, so the decoder
    can attend over the source sequence.

    Fix: the original delegated to MultiHeadAttention with
    attention(query=..., key=..., value=..., mask=...), but
    MultiHeadAttention.forward accepts only a single hidden_state and always
    projects q, k and v from it — the call raised TypeError and could never
    attend across two sequences. Cross-attention therefore gets its own
    per-head projections here.
    """

    def __init__(self, config):
        super().__init__()
        self.layer_norm = nn.LayerNorm(config.hidden_size)
        embed_dim = config.hidden_size
        num_heads = config.num_attention_heads
        head_dim = embed_dim // num_heads
        # Per-head projections: queries from the decoder side,
        # keys/values from the encoder side.
        self.q_projs = nn.ModuleList(
            [nn.Linear(embed_dim, head_dim) for _ in range(num_heads)])
        self.k_projs = nn.ModuleList(
            [nn.Linear(embed_dim, head_dim) for _ in range(num_heads)])
        self.v_projs = nn.ModuleList(
            [nn.Linear(embed_dim, head_dim) for _ in range(num_heads)])
        self.output_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, decoder_hidden, encoder_output, mask):
        # Pre-LN: normalize the decoder stream before projecting queries.
        query_input = self.layer_norm(decoder_hidden)
        head_outputs = [
            scaled_dot_product_attention(
                q(query_input), k(encoder_output), v(encoder_output), mask)
            for q, k, v in zip(self.q_projs, self.k_projs, self.v_projs)
        ]
        context = self.output_linear(torch.cat(head_outputs, dim=-1))
        # Fix: return the raw context without an internal residual.
        # TransformerDecoderLayer already does `x = x + self.cross_attention(...)`,
        # so the original's internal `decoder_hidden + context` produced a
        # double skip connection.
        return context
class FeedForward(nn.Module):
    """Position-wise feed-forward block for the decoder.

    Re-declares the encoder section's FeedForward unchanged:
    linear expansion, GELU, linear projection, dropout.
    """

    def __init__(self, config):
        super().__init__()
        self.linear_1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.linear_2 = nn.Linear(config.intermediate_size, config.hidden_size)
        self.gelu = nn.GELU()
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, x):
        # Apply the four stages in sequence.
        for stage in (self.linear_1, self.gelu, self.linear_2, self.dropout):
            x = stage(x)
        return x
2.3 Decoder Layer (Stack) - Put the pieces together.¶
class TransformerDecoderLayer(nn.Module):
    """Decoder block: self-attention, cross-attention, feed-forward.

    Each sub-block uses a residual (skip) connection in the pre-layer-norm
    style, matching TransformerEncoderLayer.
    """

    def __init__(self, config):
        super().__init__()
        self.layer_norm_1 = nn.LayerNorm(config.hidden_size)
        self.layer_norm_2 = nn.LayerNorm(config.hidden_size)
        # Decoder attention 1: self-attention over the target sequence.
        self.attention = MultiHeadAttention(config)
        # Decoder attention 2: cross-attention over the encoder output
        # (CrossAttentionLayer applies its own layer norm internally).
        self.cross_attention = CrossAttentionLayer(config)
        # Decoder feed-forward sub-block.
        self.feed_forward = FeedForward(config)

    def forward(self, x, encoder_output, mask):
        # Self-attention with pre-LN and a skip connection.
        # NOTE(review): MultiHeadAttention takes no mask argument, so this
        # self-attention is not causally masked here — confirm whether the
        # causal mask should be threaded through it as well.
        x = x + self.attention(self.layer_norm_1(x))
        # Cross-attention with a skip connection.
        # Fix: query from the *updated* decoder state x — the original passed
        # the pre-self-attention hidden_state, discarding the self-attention
        # output on this path.
        x = x + self.cross_attention(x, encoder_output, mask)
        # Feed-forward with a skip connection.
        x = x + self.feed_forward(self.layer_norm_2(x))
        return x
2.4 FULL Decoder¶
class TransformerDecoder(nn.Module):
    """Full decoder: embeddings + dropout, then a stack of decoder layers."""

    def __init__(self, config):
        super().__init__()
        self.embeddings = Embeddings(config)
        self.layers = nn.ModuleList([TransformerDecoderLayer(config)
                                     for _ in range(config.num_hidden_layers)])
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, x, encoder_output=None, mask=None):
        """Run target token ids through the decoder stack.

        Fix: TransformerDecoderLayer.forward requires
        (x, encoder_output, mask), but the original called layer(x) and
        raised TypeError. The new parameters default to None so the old
        one-argument call signature still binds.
        """
        x = self.embeddings(x)
        x = self.dropout(x)
        for layer in self.layers:
            x = layer(x, encoder_output, mask)
        return x
(D) Classification Head¶
The Full Transformer (PART 3)¶
Using the Transformer for Sequence Classification section and L4 - Classification Head as examples.
3.1 FULL Transformer¶
class Transformer(nn.Module):
    """Full encoder-decoder transformer (no output projection head yet)."""

    def __init__(self, config):
        super().__init__()
        self.encoder = TransformerEncoder(config)
        self.decoder = TransformerDecoder(config)
        # TODO: final linear layer projecting to target-vocabulary logits.

    def forward(self, inputs):
        # inputs is a (context_ids, target_ids) pair.
        context, x = inputs
        # Encoder output feeds the decoder's cross-attention.
        context = self.encoder(context)  # (batch_size, context_len, d_model)
        # Fix: the original referenced an undefined global `mask` (NameError).
        # Build the causal (lower-triangular) mask from the target length so
        # each position attends only to itself and earlier positions.
        seq_len = x.size(-1)
        mask = torch.tril(
            torch.ones(seq_len, seq_len, device=x.device)).unsqueeze(0)
        decoder_output = self.decoder(x, context, mask)  # (batch_size, target_len, d_model)
        # TODO: logits = self.final_layer(decoder_output)
        return decoder_output
TRAIN Transformer (PART 4)¶
Using the Transformer for Sequence Classification section as an example.
4.1 Hyperparameters¶
#Number of layers:
num_hidden_layers = 4
#Dimensionality of Embeddings (embed_dim)
embed_dim = 128
#number of attention heads
num_attention_heads = 8
#Dimensionality for feed-forward
dff = 512
#Dropout Rate
dropout_rate = 0.1
#hidden_dropout_prob
4.2 Load data. Tokenize text.¶
To train this transformer, we are going to reference this HuggingFace Translation Tutorial.
Download Data
#Install these packages in Terminal:
#pip install transformers datasets evaluate sacrebleu
from huggingface_hub import notebook_login
notebook_login()
VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…
from datasets import load_dataset
books = load_dataset("opus_books", "en-fr")
Downloading builder script: 0%| | 0.00/6.08k [00:00<?, ?B/s]
Downloading metadata: 0%| | 0.00/161k [00:00<?, ?B/s]
Downloading readme: 0%| | 0.00/20.5k [00:00<?, ?B/s]
Downloading data: 0%| | 0.00/12.0M [00:00<?, ?B/s]
Generating train split: 0%| | 0/127085 [00:00<?, ? examples/s]
books = books["train"].train_test_split(test_size=0.2)
books["train"][0]
{'id': '126546',
'translation': {'en': 'The paralysed woman became of daily use to her. She served as a sort of praying-desk, as a piece of furniture in front of which Therese could fearlessly confess her faults and plead for forgiveness.',
'fr': "La paralytique lui devint d'un usage journalier; elle lui servait en quelque sorte de prie-Dieu, de meuble devant lequel elle pouvait sans crainte avouer ses fautes et en demander le pardon."}}
from transformers import AutoTokenizer
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
Downloading (…)okenizer_config.json: 0%| | 0.00/2.32k [00:00<?, ?B/s]
Downloading (…)ve/main/spiece.model: 0%| | 0.00/792k [00:00<?, ?B/s]
Downloading (…)/main/tokenizer.json: 0%| | 0.00/1.39M [00:00<?, ?B/s]
source_lang = "en"
target_lang = "fr"
prefix = "translate English to French: "
def preprocess_function(examples):
inputs = [prefix + example[source_lang] for example in examples["translation"]]
targets = [example[target_lang] for example in examples["translation"]]
model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
return model_inputs
tokenized_books = books.map(preprocess_function, batched=True)
Map: 0%| | 0/101668 [00:00<?, ? examples/s]
Map: 0%| | 0/25417 [00:00<?, ? examples/s]
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)
4.3 Instantiate Transformer¶
# Fix: the original call had a syntax error (missing comma after
# `embed_dim = embed_dim`), passed keyword arguments that
# Transformer.__init__(config) does not accept, and referenced undefined
# `tokenizers.es` / `tokenizers.pt` (copied from a TensorFlow tutorial).
# Transformer takes a single BERT-style config object, so build one from the
# hyperparameters declared above and the loaded tokenizer.
from types import SimpleNamespace

transformer_config = SimpleNamespace(
    vocab_size=tokenizer.vocab_size,      # shared source/target vocabulary
    hidden_size=embed_dim,
    num_attention_heads=num_attention_heads,
    intermediate_size=dff,
    num_hidden_layers=num_hidden_layers,
    hidden_dropout_prob=dropout_rate,
    # Matches the max_length=128 used when tokenizing the dataset —
    # TODO confirm this bound covers every batch after collation.
    max_position_embeddings=128,
)
transformer = Transformer(transformer_config)